library(tidyverse)
library(glmnet)
library(caret)
library(pROC)
library(VIM)
#library(performanceEstimation)
#library(mlr)
#library(UBL)
# Load the 2017 NY HMDA loan records.
# NOTE(review): hard-coded absolute path -- prefer a relative path or a
# config/command-line argument for portability.
dat <- read.csv("/Users/yctang/Documents/Columbia/5291 Advanced data Analysis/project/data/hmda_2017_ny_all-records_labels.csv")
# remove some variables & NA
# Drop unneeded columns by position. The original index list contained
# redundant entries (`69:71` already inside `56:71`; `33:50, 51` is just
# 33:51). Duplicated negative indices are harmless in R but confusing,
# so the list is de-duplicated here -- the selected column set is identical.
dat <- subset(dat, select = -c(1:4, 6, 8, 10, 12, 15, 17, 19:27, 29, 31, 33:51, 53, 54, 56:71, 78))
#aggr(dat)
# Data Pre-processing ----
# Drop rows with any missing values
dat <- drop_na(dat)

# Re-encode HMDA categorical codes into compact groupings.
# FIX: the original `dat[dat$col == k, ]$col <- v` pattern errors with
# "replacement has 1 row, data has 0 rows" whenever no row matches a code;
# vectorized replacement on the column (`col[col %in% ...] <- v`) is
# robust to empty matches and avoids copying the whole data frame subset.

# owner_occupancy: 1 = owner-occupied principal dwelling, 0 = other
dat$owner_occupancy[dat$owner_occupancy %in% c(2, 3)] <- 0
# loan_type: 1 = conventional, 0 = other
dat$loan_type[dat$loan_type %in% c(2, 3, 4)] <- 0
# preapproval: 0 = not requested
dat$preapproval[dat$preapproval %in% c(2, 3)] <- 0
# action_taken: keep codes 1-3 only, then collapse to a binary outcome
# (1 = originated or approved-not-accepted, 0 = denied)
dat <- dat %>% filter(action_taken %in% c(1, 2, 3))
dat$action_taken[dat$action_taken == 2] <- 1
dat$action_taken[dat$action_taken == 3] <- 0
# applicant_ethnicity: 1 = Hispanic or Latino, 0 = other
dat$applicant_ethnicity[dat$applicant_ethnicity %in% c(2, 3, 4)] <- 0
# applicant_sex: 0 = unknown, 1 = male, 2 = female
dat$applicant_sex[dat$applicant_sex %in% c(3, 4)] <- 0
# applicant_race_1: 0 = other, 1 = Asian, 2 = Black or African American,
# 3 = White. Order matters: codes 1/4/6/7 are cleared first so the
# renumbering below never re-touches a value it has already assigned.
dat$applicant_race_1[dat$applicant_race_1 %in% c(1, 4, 6, 7)] <- 0
dat$applicant_race_1[dat$applicant_race_1 == 2] <- 1
dat$applicant_race_1[dat$applicant_race_1 == 3] <- 2
dat$applicant_race_1[dat$applicant_race_1 == 5] <- 3
names(dat)[names(dat) == "applicant_race_1"] <- "applicant_race"
# co_applicant_ethnicity -> binary flag: 1 = has a co-applicant, 0 = none
dat$co_applicant_ethnicity[dat$co_applicant_ethnicity == 2] <- 1
dat$co_applicant_ethnicity[dat$co_applicant_ethnicity %in% c(3, 4, 5)] <- 0
names(dat)[names(dat) == "co_applicant_ethnicity"] <- "co_applicant"

# Convert the categorical columns to factors in one pass
factor_cols <- c(
  "agency_code", "loan_type", "property_type", "loan_purpose",
  "owner_occupancy", "preapproval", "action_taken", "applicant_ethnicity",
  "co_applicant", "applicant_race", "applicant_sex"
)
dat[factor_cols] <- lapply(dat[factor_cols], as.factor)
# Data Visualization ----
# Class balance of the outcome.
# FIX: geom_bar() plots raw counts, so labelling the axis with
# scales::percent multiplied counts by 100 (a count of 1000 would read
# "100,000%"). Plot the per-class proportion instead, matching the intent
# of the commented-out version below.
dat %>%
  ggplot(aes(action_taken, fill = action_taken)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(labels = scales::percent) +
  labs(y = "proportion")

#ggplot(dat, aes(x = action_taken)) +
# geom_bar(aes(y = (..count..)/sum(..count..))) +
# scale_y_continuous(formatter = 'percent')
# Training-testing Set Split ----
# Reproducible 70/30 train/test split.
# seq_len(nrow(dat)) instead of 1:nrow(dat): the latter yields c(1, 0)
# for an empty data frame.
set.seed(123456)
indices.old <- sample(seq_len(nrow(dat)), nrow(dat) * 0.7)
training.old <- dat[indices.old, ]
testing.old <- dat[-indices.old, ]
# First Try: Logistic Regression ----
# Baseline: logistic regression on every predictor, unbalanced training set
full <- glm(action_taken ~ ., family = binomial(link = 'logit'), data = training.old)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary(fit)
# Held-out probabilities, hard-classified at the 0.5 cutoff
test.prob <- predict(full, testing.old, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred), reference = testing.old$action_taken, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1803 1485
## 1 17168 69700
##
## Accuracy : 0.7931
## 95% CI : (0.7904, 0.7957)
## No Information Rate : 0.7896
## P-Value [Acc > NIR] : 0.004678
##
## Kappa : 0.1065
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.97914
## Specificity : 0.09504
## Pos Pred Value : 0.80237
## Neg Pred Value : 0.54836
## Prevalence : 0.78958
## Detection Rate : 0.77310
## Detection Prevalence : 0.96353
## Balanced Accuracy : 0.53709
##
## 'Positive' Class : 1
##
# ROC curve / AUC for the baseline model on the held-out set
test.roc <- roc(testing.old$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Outliers, Standardization and Other Adjustments ----
# Outlier trimming and rescaling of the continuous predictors.
hist(dat$applicant_income_000s)

plot(density(dat$population[dat$population < quantile(dat$population, 0.99)]))
lines(density(dat$applicant_income_000s[dat$applicant_income_000s < quantile(dat$applicant_income_000s, 0.99)]))

# Uncentered scale(): divides by the column's root-mean-square.
rms_scale <- function(v) as.numeric(scale(v, center = FALSE))
# NOTE: the two filter() calls are sequential on purpose -- the loan-amount
# quantile is computed on the income-filtered data, as in the original.
dat <- dat %>%
  filter(applicant_income_000s < quantile(applicant_income_000s, 0.96)) %>%
  filter(loan_amount_000s < quantile(loan_amount_000s, 0.96)) %>%
  mutate(
    applicant_income = rms_scale(applicant_income_000s),
    loan_amount = rms_scale(loan_amount_000s),
    population = rms_scale(population),
    hud_median_family_income = rms_scale(hud_median_family_income),
    number_of_owner_occupied_units = rms_scale(number_of_owner_occupied_units),
    number_of_1_to_4_family_units = rms_scale(number_of_1_to_4_family_units),
    # percentages -> proportions
    minority_population = minority_population / 100,
    tract_to_msamd_income = tract_to_msamd_income / 100
  ) %>%
  dplyr::select(-c(applicant_income_000s, loan_amount_000s)) %>%
  dplyr::select(action_taken, everything())
hist(dat$applicant_income)

plot(density(dat$population))
lines(density(dat$applicant_income))

plot(density(dat$population[dat$population < quantile(dat$population, 0.99)]))
lines(density(dat$applicant_income[dat$applicant_income < quantile(dat$applicant_income, 0.99)]))

# Second Try: Logistic Regression ----
# Subsample 1000 rows (keeps the later SMOTE / neural-net / FAMD
# experiments fast), then re-split 70/30 and refit the logistic baseline.
# seq_len() replaces the 1:nrow() anti-pattern (safe for empty frames).
set.seed(123456)
dat <- sample_n(dat, 1000)
indices <- sample(seq_len(nrow(dat)), nrow(dat) * 0.7)
training <- dat[indices, ]
testing <- dat[-indices, ]
full <- glm(action_taken ~ ., family = binomial(link = 'logit'), data = training)
#summary(fit)
test.prob <- predict(full, testing, type = "response")
test.pred <- as.numeric(ifelse(test.prob > 0.5, 1, 0))
confusionMatrix(data = as.factor(test.pred), reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 8 16
## 1 52 224
##
## Accuracy : 0.7733
## 95% CI : (0.7217, 0.8195)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 0.8888
##
## Kappa : 0.086
##
## Mcnemar's Test P-Value : 2.192e-05
##
## Sensitivity : 0.9333
## Specificity : 0.1333
## Pos Pred Value : 0.8116
## Neg Pred Value : 0.3333
## Prevalence : 0.8000
## Detection Rate : 0.7467
## Detection Prevalence : 0.9200
## Balanced Accuracy : 0.5333
##
## 'Positive' Class : 1
##
# ROC curve / AUC for the subsampled baseline model
test.roc <- roc(testing$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# SMOTE & L1 Regularization ----
# install the RSBID package
#install.packages("devtools")
#devtools::install_github("dongyuanwu/RSBID")
library(RSBID)
## Loading required package: FNN
## Loading required package: clustMixType
## Loading required package: klaR
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#training.dummy$action_taken1 <- as.factor(training.dummy$action_taken1)
#testing.dummy$action_taken1 <- as.factor(testing.dummy$action_taken1)
# too slow! sample 50000 for testing
#set.seed(123456)
#training <- sample_n(training, 50000)
# Rebalance the training classes with SMOTE-NC (SMOTE variant for mixed
# continuous/categorical predictors), timing the run.
ptm <- proc.time()
training.bal <- SMOTE_NC(training, "action_taken")
## Variables are continous and categorical, SMOTE_NC could be used.
proc.time() - ptm # running time
## user system elapsed
## 11.882 0.088 12.167
# One-hot encode factors for glmnet; drop the intercept column
training.bal.dummy <- data.frame(model.matrix( ~ ., training.bal)[, -1])
testing.dummy <- data.frame(model.matrix( ~ ., testing)[, -1])
X <- as.matrix(training.bal.dummy[-1])
Y <- training.bal.dummy$action_taken1
# Cross-validate lambda, then fit the lasso-penalized logistic model
cv <- cv.glmnet(X, Y, family = "binomial")
fit.L1 <- glmnet(X, Y, family = "binomial", alpha = 1, lambda = cv$lambda.min)
# confusion matrix and roc curve
# FIX: predict() on a glmnet fit returns the linear predictor (log-odds)
# by default, so comparing it with 0.5 effectively thresholded at a
# probability of ~0.62. Request type = "response" so the 0.5 cutoff is on
# the probability scale. (The recorded output below predates this fix.)
test.prob <- fit.L1 %>% predict(newx = as.matrix(testing.dummy[-1]), type = "response")
test.pred <- as.numeric(ifelse(test.prob > 0.5, 1, 0))
mean(test.pred == testing.dummy$action_taken1)
## [1] 0.5766667
confusionMatrix(data = as.factor(test.pred), reference = factor(testing.dummy$action_taken1), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 40 107
## 1 20 133
##
## Accuracy : 0.5767
## 95% CI : (0.5186, 0.6332)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.143
##
## Mcnemar's Test P-Value : 2.325e-14
##
## Sensitivity : 0.5542
## Specificity : 0.6667
## Pos Pred Value : 0.8693
## Neg Pred Value : 0.2721
## Prevalence : 0.8000
## Detection Rate : 0.4433
## Detection Prevalence : 0.5100
## Balanced Accuracy : 0.6104
##
## 'Positive' Class : 1
##
# ROC/AUC for the lasso model (glmnet predictions come back as an n x 1
# matrix, so flatten to a numeric vector for roc())
test.roc <- roc(testing.dummy$action_taken1 ~ as.numeric(test.prob), plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Miscellaneous experiments (originally "乱七八糟瞎try": assorted exploratory tries) ----
# Try: Neural Network ----
library(neuralnet)
# One hidden layer of 5 units, cross-entropy error, on the SMOTE-balanced
# one-hot data; stepmax raised to 1e7 to allow convergence. Timed.
ptm <- proc.time()
set.seed(123456)
NN <- neuralnet(action_taken1 ~ ., training.bal.dummy, hidden = 5, linear.output = FALSE, err.fct = 'ce', stepmax = 1e7)
proc.time() - ptm # running time
## user system elapsed
## 85.252 2.654 89.757
plot(NN)
# Forward-pass the held-out set and classify at the 0.5 cutoff
predict_NN <- compute(NN, testing.dummy[-1])
test.pred <- as.numeric(predict_NN$net.result > 0.5)
mean(test.pred == testing.dummy$action_taken1)
## [1] 0.6166667
confusionMatrix(data = as.factor(test.pred), reference = factor(testing.dummy$action_taken1), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 25 80
## 1 35 160
##
## Accuracy : 0.6167
## 95% CI : (0.559, 0.672)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.065
##
## Mcnemar's Test P-Value : 4.078e-05
##
## Sensitivity : 0.6667
## Specificity : 0.4167
## Pos Pred Value : 0.8205
## Neg Pred Value : 0.2381
## Prevalence : 0.8000
## Detection Rate : 0.5333
## Detection Prevalence : 0.6500
## Balanced Accuracy : 0.5417
##
## 'Positive' Class : 1
##
# ROC/AUC for the neural net.
# FIX: net.result is an n x 1 matrix; pROC deprecates matrix predictors
# (see the warning recorded in the original run), so flatten it to a
# numeric vector first.
test.roc <- roc(testing.dummy$action_taken1 ~ as.numeric(predict_NN$net.result), plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# PCR (principal components regression) -- abandoned attempt, kept commented out below ----
#library(pls)
#use model to make predictions on a test set
#pcr_model <- pcr(action_taken1 ~ ., data = training.bal.dummy, validation = "CV")
#summary(pcr_model)
#pcr_pred <- predict(pcr_model, as.matrix(testing.dummy[-1]), ncomp = 2)
#calculate RMSE
#sqrt(mean((pcr_pred - testing.dummy$action_taken1)^2))
#test.prob.pcr <- pcr_model %>% predict(newx = as.matrix(testing.dummy[-1]))
#test.pred.pcr <- as.numeric(ifelse(pcr_pred > 0.5, 1, 0))
#confusionMatrix(data = as.factor(test.pred.pcr), reference = as.factor(testing.dummy$action_taken1), positive = "1")
#test.pcr.roc <- roc(testing.dummy$action_taken1 ~ as.numeric(pcr_pred), plot = TRUE, print.auc = TRUE)
# FAMD (Factor Analysis of Mixed Data) ----
library(FactoMineR)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
options(ggrepel.max.overlaps = Inf)
# FAMD (8 components) on the predictors of the unbalanced training set;
# project both splits into the component space and re-attach the label.
famd1 <- FAMD(training[, -1], ncp = 8)

training.famd <- data.frame(famd1$ind$coord, action_taken = training$action_taken)
testing.famd <- data.frame(predict.FAMD(famd1, testing[, -1])$coord, action_taken = testing$action_taken)
# Same decomposition on the SMOTE-balanced training predictors
famd2 <- FAMD(training.bal[, -1], ncp = 8)

training.bal.famd <- data.frame(famd2$ind$coord, action_taken = training.bal$action_taken)
testing.bal.famd <- data.frame(predict.FAMD(famd2, testing[, -1])$coord, action_taken = testing$action_taken)
# FAMD unbalanced training set
# Logistic regression on the 8 FAMD scores from the unbalanced split
full <- glm(action_taken ~ ., family = binomial(link = "logit"), data = training.famd)
test.prob <- predict(full, testing.famd, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred), reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1 2
## 1 59 238
##
## Accuracy : 0.7967
## 95% CI : (0.7466, 0.8407)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 0.591
##
## Kappa : 0.0129
##
## Mcnemar's Test P-Value : 7.496e-13
##
## Sensitivity : 0.99167
## Specificity : 0.01667
## Pos Pred Value : 0.80135
## Neg Pred Value : 0.33333
## Prevalence : 0.80000
## Detection Rate : 0.79333
## Detection Prevalence : 0.99000
## Balanced Accuracy : 0.50417
##
## 'Positive' Class : 1
##
# ROC/AUC for the FAMD (unbalanced) logistic model
test.roc <- roc(testing$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# FAMD balanced training set
# Same model, fit on the FAMD scores of the SMOTE-balanced split
full <- glm(action_taken ~ ., family = binomial(link = "logit"), data = training.bal.famd)
test.prob <- predict(full, testing.bal.famd, type = "response")
test.pred <- as.numeric(test.prob > 0.5)
confusionMatrix(data = as.factor(test.pred), reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 27 76
## 1 33 164
##
## Accuracy : 0.6367
## 95% CI : (0.5794, 0.6912)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1051
##
## Mcnemar's Test P-Value : 5.749e-05
##
## Sensitivity : 0.6833
## Specificity : 0.4500
## Pos Pred Value : 0.8325
## Neg Pred Value : 0.2621
## Prevalence : 0.8000
## Detection Rate : 0.5467
## Detection Prevalence : 0.6567
## Balanced Accuracy : 0.5667
##
## 'Positive' Class : 1
##
# ROC/AUC for the FAMD (balanced) logistic model
test.roc <- roc(testing$action_taken ~ test.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Random Forest ----
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# class of label/dependent variables should be `factor` -> classification
#training.bal.dummy$action_taken1 <- as.factor(training.bal.dummy$action_taken1)
# Random forest on the SMOTE-balanced training data. The randomForest()
# call is kept verbatim so the RNG draws (and hence the fitted forest)
# are unchanged.
rf_model <- randomForest(action_taken ~ ., data = training.bal, proximity = TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = action_taken ~ ., data = training.bal, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 17.44%
## Confusion matrix:
## 0 1 class.error
## 0 445 91 0.1697761
## 1 96 440 0.1791045
# type = "prob" -> gets the probability; second column = class "1"
rf_prob <- predict(rf_model, testing, type = "prob")[, 2]
#calculate RMSE
#sqrt(mean((rf_pred - testing.dummy$action_taken1)^2))
# hard-classify at the 0.5 cutoff
test.pred.rf <- as.numeric(rf_prob > 0.5)
confusionMatrix(data = as.factor(test.pred.rf), reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 19 51
## 1 41 189
##
## Accuracy : 0.6933
## 95% CI : (0.6378, 0.745)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.098
##
## Mcnemar's Test P-Value : 0.3481
##
## Sensitivity : 0.7875
## Specificity : 0.3167
## Pos Pred Value : 0.8217
## Neg Pred Value : 0.2714
## Prevalence : 0.8000
## Detection Rate : 0.6300
## Detection Prevalence : 0.7667
## Balanced Accuracy : 0.5521
##
## 'Positive' Class : 1
##
# ROC/AUC for the random forest on the held-out set
test.rf.roc <- roc(testing$action_taken ~ rf_prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# ROC for training set
#rf.roc <- roc(training.bal$action_taken, rf_model$votes[, 2], plot = TRUE, print.auc = TRUE)
# RandomForest based on famd balanced data
# Same forest, trained on the 8 FAMD dimensions of the balanced split.
# Call kept verbatim so the RNG draws are unchanged.
rf_model2 <- randomForest(action_taken ~ ., data = training.bal.famd, proximity = TRUE)
print(rf_model2)
##
## Call:
## randomForest(formula = action_taken ~ ., data = training.bal.famd, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 23.79%
## Confusion matrix:
## 0 1 class.error
## 0 405 131 0.2444030
## 1 124 412 0.2313433
# held-out class-1 probabilities, hard-classified at 0.5
rf_prob <- predict(rf_model2, testing.bal.famd, type = "prob")[, 2]
test.pred.rf <- as.numeric(rf_prob > 0.5)
confusionMatrix(data = as.factor(test.pred.rf), reference = testing$action_taken, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21 54
## 1 39 186
##
## Accuracy : 0.69
## 95% CI : (0.6343, 0.7419)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.1143
##
## Mcnemar's Test P-Value : 0.1466
##
## Sensitivity : 0.7750
## Specificity : 0.3500
## Pos Pred Value : 0.8267
## Neg Pred Value : 0.2800
## Prevalence : 0.8000
## Detection Rate : 0.6200
## Detection Prevalence : 0.7500
## Balanced Accuracy : 0.5625
##
## 'Positive' Class : 1
##
# ROC/AUC for the FAMD-based random forest
test.rf.roc <- roc(testing$action_taken ~ rf_prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# XGBoost ----
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# One-hot encode via sparse model matrices and wrap in xgb.DMatrix with
# 0/1 labels (as.numeric(factor) - 1 maps levels "0"/"1" to 0/1).
# sparse.model.matrix() comes from the Matrix package, which is on the
# search path as a Depends of glmnet (loaded at the top of the file).
training_sparse <- sparse.model.matrix(action_taken ~ . - 1, training.bal)
training_label <- training.bal[, 1]
train_matrix <- xgb.DMatrix(data = training_sparse, label = as.numeric(training_label) - 1)
testing_sparse <- sparse.model.matrix(action_taken ~ . - 1, testing)
testing_label <- testing[, 1]
test_matrix <- xgb.DMatrix(data = testing_sparse, label = as.numeric(testing_label) - 1)
# Hyper-parameters: xgboost defaults, spelled out for a first pass
params <- list(
  booster = "gbtree", objective = "binary:logistic", eta = 0.3, gamma = 0,
  max_depth = 6, min_child_weight = 1, subsample = 1, colsample_bytree = 1
)
# 5-fold CV over 100 boosting rounds to gauge the test-logloss curve.
# NOTE(review): assigning the result to `xgb.cv` shadows the function
# name; R still resolves the later `xgb.cv(...)` call because call
# position looks up functions only, but a distinct name would be clearer.
xgb.cv <- xgb.cv(params = params, data = train_matrix, nfold = 5, nrounds = 100)
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:47] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1] train-logloss:0.575328+0.002984 test-logloss:0.615247+0.012894
## [2] train-logloss:0.501930+0.005430 test-logloss:0.569148+0.015696
## [3] train-logloss:0.444070+0.006646 test-logloss:0.542786+0.024643
## [4] train-logloss:0.403595+0.012812 test-logloss:0.522496+0.019828
## [5] train-logloss:0.369982+0.014294 test-logloss:0.511582+0.020290
## [6] train-logloss:0.345851+0.011975 test-logloss:0.504394+0.023060
## [7] train-logloss:0.324557+0.013860 test-logloss:0.497223+0.027897
## [8] train-logloss:0.307368+0.016076 test-logloss:0.486923+0.028678
## [9] train-logloss:0.293764+0.016048 test-logloss:0.485366+0.029859
## [10] train-logloss:0.279038+0.017221 test-logloss:0.484922+0.031905
## [11] train-logloss:0.263453+0.013164 test-logloss:0.478519+0.034179
## [12] train-logloss:0.251357+0.016550 test-logloss:0.477193+0.033905
## [13] train-logloss:0.241299+0.014537 test-logloss:0.472791+0.033833
## [14] train-logloss:0.230877+0.015272 test-logloss:0.473832+0.037299
## [15] train-logloss:0.222188+0.016450 test-logloss:0.471588+0.038575
## [16] train-logloss:0.213416+0.015626 test-logloss:0.469947+0.040920
## [17] train-logloss:0.203699+0.012803 test-logloss:0.468465+0.045120
## [18] train-logloss:0.195589+0.013835 test-logloss:0.468523+0.046560
## [19] train-logloss:0.189764+0.014508 test-logloss:0.468528+0.046030
## [20] train-logloss:0.180556+0.014581 test-logloss:0.466738+0.046268
## [21] train-logloss:0.174003+0.014733 test-logloss:0.465854+0.047783
## [22] train-logloss:0.168908+0.015769 test-logloss:0.463761+0.046592
## [23] train-logloss:0.163050+0.017333 test-logloss:0.462026+0.046306
## [24] train-logloss:0.155722+0.017056 test-logloss:0.458481+0.043100
## [25] train-logloss:0.151106+0.017426 test-logloss:0.458941+0.042036
## [26] train-logloss:0.146893+0.018231 test-logloss:0.458409+0.045516
## [27] train-logloss:0.140881+0.016855 test-logloss:0.459703+0.045748
## [28] train-logloss:0.135411+0.014836 test-logloss:0.455239+0.044243
## [29] train-logloss:0.132932+0.014751 test-logloss:0.455812+0.044813
## [30] train-logloss:0.127719+0.013357 test-logloss:0.453189+0.044798
## [31] train-logloss:0.123196+0.013099 test-logloss:0.450902+0.044518
## [32] train-logloss:0.119075+0.013502 test-logloss:0.450034+0.045058
## [33] train-logloss:0.114776+0.012050 test-logloss:0.451793+0.044426
## [34] train-logloss:0.109532+0.010637 test-logloss:0.450099+0.044999
## [35] train-logloss:0.105817+0.011119 test-logloss:0.450079+0.043500
## [36] train-logloss:0.102665+0.011020 test-logloss:0.451424+0.045060
## [37] train-logloss:0.098994+0.010073 test-logloss:0.449638+0.047872
## [38] train-logloss:0.096057+0.009571 test-logloss:0.449446+0.047866
## [39] train-logloss:0.093768+0.010869 test-logloss:0.450172+0.049167
## [40] train-logloss:0.090273+0.009789 test-logloss:0.451410+0.049300
## [41] train-logloss:0.087471+0.009941 test-logloss:0.451510+0.051148
## [42] train-logloss:0.084873+0.010605 test-logloss:0.451232+0.048054
## [43] train-logloss:0.082608+0.010582 test-logloss:0.451934+0.048905
## [44] train-logloss:0.080523+0.009924 test-logloss:0.452319+0.048454
## [45] train-logloss:0.078267+0.009568 test-logloss:0.455123+0.047856
## [46] train-logloss:0.076320+0.008972 test-logloss:0.454527+0.046549
## [47] train-logloss:0.073697+0.008706 test-logloss:0.454017+0.047504
## [48] train-logloss:0.071221+0.007831 test-logloss:0.452042+0.048680
## [49] train-logloss:0.069412+0.008170 test-logloss:0.450622+0.048321
## [50] train-logloss:0.067979+0.008015 test-logloss:0.450975+0.048672
## [51] train-logloss:0.066185+0.008269 test-logloss:0.452030+0.047807
## [52] train-logloss:0.064518+0.008413 test-logloss:0.453741+0.048108
## [53] train-logloss:0.062381+0.007959 test-logloss:0.452425+0.049620
## [54] train-logloss:0.061182+0.007974 test-logloss:0.453394+0.049104
## [55] train-logloss:0.059464+0.007872 test-logloss:0.453838+0.050617
## [56] train-logloss:0.057926+0.007693 test-logloss:0.454876+0.050739
## [57] train-logloss:0.056849+0.007496 test-logloss:0.454179+0.051505
## [58] train-logloss:0.055563+0.007523 test-logloss:0.454796+0.050938
## [59] train-logloss:0.054687+0.007520 test-logloss:0.455452+0.052946
## [60] train-logloss:0.053337+0.007451 test-logloss:0.455313+0.054009
## [61] train-logloss:0.052165+0.007149 test-logloss:0.454925+0.054293
## [62] train-logloss:0.050944+0.006948 test-logloss:0.456024+0.054014
## [63] train-logloss:0.049851+0.006989 test-logloss:0.457142+0.053746
## [64] train-logloss:0.048811+0.006789 test-logloss:0.456656+0.054508
## [65] train-logloss:0.047653+0.006435 test-logloss:0.456543+0.054898
## [66] train-logloss:0.046775+0.006389 test-logloss:0.458538+0.055702
## [67] train-logloss:0.045698+0.006298 test-logloss:0.459710+0.055899
## [68] train-logloss:0.044676+0.006100 test-logloss:0.460719+0.056485
## [69] train-logloss:0.043880+0.006006 test-logloss:0.461375+0.058295
## [70] train-logloss:0.043038+0.005914 test-logloss:0.463882+0.060501
## [71] train-logloss:0.042217+0.005677 test-logloss:0.464632+0.060130
## [72] train-logloss:0.041275+0.005421 test-logloss:0.465915+0.060754
## [73] train-logloss:0.040403+0.005369 test-logloss:0.465292+0.060819
## [74] train-logloss:0.039618+0.005264 test-logloss:0.465989+0.060903
## [75] train-logloss:0.038918+0.005151 test-logloss:0.465025+0.060670
## [76] train-logloss:0.038189+0.005027 test-logloss:0.465463+0.061083
## [77] train-logloss:0.037413+0.004810 test-logloss:0.467112+0.061760
## [78] train-logloss:0.036660+0.004687 test-logloss:0.468336+0.060842
## [79] train-logloss:0.035989+0.004428 test-logloss:0.468957+0.061755
## [80] train-logloss:0.035332+0.004312 test-logloss:0.468916+0.062188
## [81] train-logloss:0.034837+0.004255 test-logloss:0.470496+0.061811
## [82] train-logloss:0.034178+0.004081 test-logloss:0.469951+0.062935
## [83] train-logloss:0.033736+0.004036 test-logloss:0.469360+0.061852
## [84] train-logloss:0.033133+0.003929 test-logloss:0.469082+0.062120
## [85] train-logloss:0.032577+0.003808 test-logloss:0.469844+0.063032
## [86] train-logloss:0.032004+0.003683 test-logloss:0.470098+0.063230
## [87] train-logloss:0.031646+0.003724 test-logloss:0.471558+0.063476
## [88] train-logloss:0.031281+0.003732 test-logloss:0.472890+0.063665
## [89] train-logloss:0.030859+0.003643 test-logloss:0.473153+0.063413
## [90] train-logloss:0.030441+0.003599 test-logloss:0.472045+0.063890
## [91] train-logloss:0.030058+0.003507 test-logloss:0.473470+0.063892
## [92] train-logloss:0.029588+0.003398 test-logloss:0.473202+0.063907
## [93] train-logloss:0.029167+0.003299 test-logloss:0.474970+0.063640
## [94] train-logloss:0.028801+0.003199 test-logloss:0.475443+0.064750
## [95] train-logloss:0.028409+0.003156 test-logloss:0.476730+0.065356
## [96] train-logloss:0.028040+0.003127 test-logloss:0.476242+0.065973
## [97] train-logloss:0.027681+0.003067 test-logloss:0.476400+0.065921
## [98] train-logloss:0.027268+0.002946 test-logloss:0.476300+0.066208
## [99] train-logloss:0.026913+0.002930 test-logloss:0.476575+0.067105
## [100] train-logloss:0.026521+0.002826 test-logloss:0.477062+0.067162
## best iteration: minimum CV test-logloss (~0.4494) is reached around round 38
# Train the final boosted model with genuine early stopping.
# FIX: the original passed `early_stop_round`, which xgboost does not
# recognise (see the "Parameters: { early_stop_round } might not be used"
# warning recorded below), so no early stopping ever happened. The correct
# argument is `early_stopping_rounds`. xgboost stops on the LAST entry of
# the watchlist, so `val` is placed last to stop on validation error.
# (The training log recorded below predates this fix.)
xgb1 <- xgb.train(params = params, data = train_matrix, nrounds = 100, watchlist = list(train = train_matrix, val = test_matrix), print_every_n = 10, early_stopping_rounds = 10, maximize = FALSE, eval_metric = "error")
## [23:01:47] WARNING: amalgamation/../src/learner.cc:576:
## Parameters: { "early_stop_round" } might not be used.
##
## This could be a false alarm, with some parameters getting used by language bindings but
## then being mistakenly passed down to XGBoost core, or some parameter actually being used
## but getting flagged wrongly here. Please open an issue if you find any such cases.
##
##
## [1] val-error:0.353333 train-error:0.175373
## [11] val-error:0.313333 train-error:0.061567
## [21] val-error:0.300000 train-error:0.021455
## [31] val-error:0.303333 train-error:0.009328
## [41] val-error:0.310000 train-error:0.001866
## [51] val-error:0.303333 train-error:0.000000
## [61] val-error:0.303333 train-error:0.000000
## [71] val-error:0.310000 train-error:0.000000
## [81] val-error:0.303333 train-error:0.000000
## [91] val-error:0.296667 train-error:0.000000
## [100] val-error:0.290000 train-error:0.000000
# Held-out probabilities from the booster, hard-classified at 0.5
xgb.prob <- predict(xgb1, test_matrix)
xgb.pred <- as.numeric(xgb.prob > 0.5)
confusionMatrix(data = as.factor(xgb.pred), reference = testing_label, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21 48
## 1 39 192
##
## Accuracy : 0.71
## 95% CI : (0.6551, 0.7607)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 0.9999
##
## Kappa : 0.142
##
## Mcnemar's Test P-Value : 0.3911
##
## Sensitivity : 0.8000
## Specificity : 0.3500
## Pos Pred Value : 0.8312
## Neg Pred Value : 0.3043
## Prevalence : 0.8000
## Detection Rate : 0.6400
## Detection Prevalence : 0.7700
## Balanced Accuracy : 0.5750
##
## 'Positive' Class : 1
##
# ROC/AUC for the boosted model.
# NOTE(review): the name `test.rf.roc` is reused from the random-forest
# section above; a distinct name (e.g. test.xgb.roc) would avoid clobbering.
test.rf.roc <- roc(testing$action_taken ~ xgb.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Gain-based feature importance from the fitted booster
mat1 <- xgb.importance(feature_names = colnames(training_sparse), model = xgb1)
xgb.plot.importance(importance_matrix = mat1)
# xgboost based on famd balanced training data
# Rebuild the DMatrix pair from the 8 FAMD dimensions of the balanced split.
training_sparse <- sparse.model.matrix(action_taken ~ . - 1, training.bal.famd)
# NOTE(review): labels come from training.bal column 1 (action_taken);
# this relies on training.bal.famd preserving the row order of
# training.bal (it does -- famd2 was fit on training.bal), but taking
# them from training.bal.famd$action_taken would be more direct.
training_label <- training.bal[, 1]
train_matrix <- xgb.DMatrix(data = training_sparse, label = as.numeric(training_label) - 1)
testing_sparse <- sparse.model.matrix(action_taken ~ . - 1, testing.bal.famd)
testing_label <- testing[, 1]
test_matrix <- xgb.DMatrix(data = testing_sparse, label = as.numeric(testing_label) - 1)
# Same default hyper-parameters as the first xgboost run
params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.3, gamma = 0, max_depth = 6, min_child_weight = 1, subsample = 1, colsample_bytree = 1)
xgb.cv <- xgb.cv(params = params, data = train_matrix, nfold = 5, nrounds = 100)
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [23:01:48] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
## [1] train-logloss:0.585364+0.005323 test-logloss:0.641555+0.012803
## [2] train-logloss:0.510910+0.004971 test-logloss:0.610823+0.022065
## [3] train-logloss:0.456689+0.008054 test-logloss:0.592870+0.031595
## [4] train-logloss:0.416464+0.007748 test-logloss:0.583032+0.037100
## [5] train-logloss:0.385563+0.010093 test-logloss:0.579411+0.042217
## [6] train-logloss:0.363716+0.010632 test-logloss:0.574218+0.045401
## [7] train-logloss:0.343054+0.012408 test-logloss:0.573916+0.044699
## [8] train-logloss:0.321494+0.010599 test-logloss:0.568858+0.043291
## [9] train-logloss:0.308004+0.013985 test-logloss:0.567986+0.042024
## [10] train-logloss:0.294271+0.010289 test-logloss:0.568300+0.046385
## [11] train-logloss:0.280777+0.014025 test-logloss:0.565394+0.046174
## [12] train-logloss:0.265999+0.014082 test-logloss:0.566940+0.048169
## [13] train-logloss:0.252873+0.013566 test-logloss:0.570085+0.052022
## [14] train-logloss:0.241129+0.011083 test-logloss:0.570801+0.055336
## [15] train-logloss:0.230598+0.008013 test-logloss:0.567980+0.057385
## [16] train-logloss:0.223197+0.009210 test-logloss:0.565542+0.057120
## [17] train-logloss:0.213887+0.008812 test-logloss:0.565326+0.059599
## [18] train-logloss:0.206797+0.009409 test-logloss:0.565711+0.058902
## [19] train-logloss:0.197965+0.010242 test-logloss:0.563028+0.061834
## [20] train-logloss:0.188649+0.008737 test-logloss:0.565524+0.062479
## [21] train-logloss:0.183562+0.009714 test-logloss:0.569271+0.063995
## [22] train-logloss:0.176623+0.010398 test-logloss:0.570358+0.062927
## [23] train-logloss:0.170337+0.010265 test-logloss:0.571165+0.064548
## [24] train-logloss:0.163256+0.007652 test-logloss:0.571667+0.063890
## [25] train-logloss:0.159203+0.006868 test-logloss:0.570952+0.064961
## [26] train-logloss:0.155216+0.006977 test-logloss:0.572274+0.064570
## [27] train-logloss:0.148178+0.006927 test-logloss:0.572924+0.062525
## [28] train-logloss:0.143701+0.006058 test-logloss:0.574565+0.062850
## [29] train-logloss:0.138450+0.006267 test-logloss:0.574484+0.062575
## [30] train-logloss:0.134303+0.007769 test-logloss:0.576855+0.064248
## [31] train-logloss:0.130123+0.007243 test-logloss:0.577763+0.066619
## [32] train-logloss:0.125050+0.007921 test-logloss:0.578214+0.066727
## [33] train-logloss:0.119356+0.007601 test-logloss:0.576790+0.068997
## [34] train-logloss:0.114638+0.006381 test-logloss:0.578047+0.067789
## [35] train-logloss:0.110893+0.006411 test-logloss:0.579404+0.067475
## [36] train-logloss:0.107472+0.006687 test-logloss:0.581695+0.067379
## [37] train-logloss:0.103749+0.006775 test-logloss:0.583742+0.068978
## [38] train-logloss:0.100555+0.006849 test-logloss:0.582975+0.070291
## [39] train-logloss:0.097605+0.007034 test-logloss:0.583663+0.071660
## [40] train-logloss:0.094639+0.007080 test-logloss:0.584720+0.072417
## [41] train-logloss:0.092086+0.006369 test-logloss:0.584141+0.073535
## [42] train-logloss:0.089178+0.006150 test-logloss:0.585927+0.073351
## [43] train-logloss:0.086956+0.006941 test-logloss:0.588143+0.073289
## [44] train-logloss:0.084385+0.006469 test-logloss:0.588597+0.074288
## [45] train-logloss:0.082045+0.006278 test-logloss:0.590760+0.076168
## [46] train-logloss:0.080451+0.006298 test-logloss:0.591831+0.076982
## [47] train-logloss:0.078084+0.005693 test-logloss:0.593621+0.078230
## [48] train-logloss:0.076211+0.005531 test-logloss:0.595418+0.080243
## [49] train-logloss:0.074120+0.005305 test-logloss:0.595750+0.080684
## [50] train-logloss:0.072236+0.005353 test-logloss:0.596068+0.082646
## [51] train-logloss:0.070609+0.005514 test-logloss:0.597855+0.083252
## [52] train-logloss:0.068738+0.005227 test-logloss:0.601925+0.082556
## [53] train-logloss:0.067056+0.004848 test-logloss:0.602203+0.083003
## [54] train-logloss:0.065508+0.004865 test-logloss:0.602596+0.083258
## [55] train-logloss:0.063658+0.004763 test-logloss:0.605338+0.083968
## [56] train-logloss:0.061651+0.004266 test-logloss:0.607431+0.081741
## [57] train-logloss:0.060358+0.004081 test-logloss:0.609811+0.082161
## [58] train-logloss:0.059187+0.003809 test-logloss:0.611453+0.082976
## [59] train-logloss:0.057923+0.003669 test-logloss:0.612158+0.082891
## [60] train-logloss:0.056807+0.003516 test-logloss:0.615115+0.084260
## [61] train-logloss:0.055713+0.003565 test-logloss:0.615966+0.085714
## [62] train-logloss:0.054515+0.003650 test-logloss:0.618053+0.086857
## [63] train-logloss:0.053397+0.003696 test-logloss:0.618726+0.086661
## [64] train-logloss:0.052119+0.003733 test-logloss:0.620697+0.088931
## [65] train-logloss:0.051176+0.003463 test-logloss:0.621296+0.088552
## [66] train-logloss:0.050049+0.003325 test-logloss:0.622687+0.089402
## [67] train-logloss:0.048811+0.003066 test-logloss:0.624679+0.090095
## [68] train-logloss:0.047778+0.002979 test-logloss:0.626513+0.091374
## [69] train-logloss:0.046798+0.002911 test-logloss:0.628137+0.091883
## [70] train-logloss:0.045988+0.002848 test-logloss:0.629469+0.091519
## [71] train-logloss:0.045283+0.002717 test-logloss:0.631048+0.092421
## [72] train-logloss:0.044272+0.002583 test-logloss:0.634337+0.092077
## [73] train-logloss:0.043631+0.002504 test-logloss:0.634253+0.092245
## [74] train-logloss:0.042993+0.002364 test-logloss:0.633787+0.092208
## [75] train-logloss:0.042196+0.002478 test-logloss:0.635289+0.091771
## [76] train-logloss:0.041289+0.002391 test-logloss:0.636420+0.091375
## [77] train-logloss:0.040616+0.002268 test-logloss:0.634997+0.091845
## [78] train-logloss:0.039842+0.002260 test-logloss:0.636639+0.092253
## [79] train-logloss:0.039181+0.002205 test-logloss:0.636972+0.093622
## [80] train-logloss:0.038519+0.002150 test-logloss:0.640835+0.092698
## [81] train-logloss:0.037860+0.002151 test-logloss:0.642254+0.093535
## [82] train-logloss:0.037427+0.002146 test-logloss:0.643002+0.094331
## [83] train-logloss:0.036773+0.002067 test-logloss:0.643898+0.094224
## [84] train-logloss:0.036281+0.002178 test-logloss:0.645239+0.095216
## [85] train-logloss:0.035751+0.002111 test-logloss:0.645972+0.094806
## [86] train-logloss:0.035313+0.002025 test-logloss:0.646102+0.095244
## [87] train-logloss:0.034747+0.002061 test-logloss:0.647250+0.095034
## [88] train-logloss:0.034342+0.002093 test-logloss:0.647530+0.095802
## [89] train-logloss:0.033796+0.001991 test-logloss:0.647775+0.094590
## [90] train-logloss:0.033463+0.002076 test-logloss:0.649119+0.094242
## [91] train-logloss:0.033007+0.002044 test-logloss:0.648761+0.093406
## [92] train-logloss:0.032470+0.001980 test-logloss:0.649214+0.093089
## [93] train-logloss:0.032085+0.001945 test-logloss:0.650466+0.095023
## [94] train-logloss:0.031636+0.001789 test-logloss:0.650337+0.096600
## [95] train-logloss:0.031314+0.001873 test-logloss:0.651095+0.095395
## [96] train-logloss:0.030904+0.001804 test-logloss:0.652770+0.097038
## [97] train-logloss:0.030469+0.001789 test-logloss:0.653858+0.097896
## [98] train-logloss:0.030078+0.001741 test-logloss:0.655069+0.097545
## [99] train-logloss:0.029673+0.001658 test-logloss:0.657145+0.097130
## [100] train-logloss:0.029300+0.001659 test-logloss:0.658557+0.097644
## best iteration = 19
# Train the final XGBoost booster on the full training set, tracking both the
# training and validation error at every boosting round.
# FIX(review): the original passed `early_stop_round`, which XGBoost does not
# recognize — its own log warns 'Parameters: { "early_stop_round" } might not
# be used' (see output below) — so early stopping was silently disabled. The
# correct argument is `early_stopping_rounds`. With early stopping active,
# xgboost monitors the LAST entry of `watchlist`, so `val` is placed last here
# (the original had `train` last, which would never trigger a stop).
xgb2 <- xgb.train(
  params = params,
  data = train_matrix,
  nrounds = 100,
  watchlist = list(train = train_matrix, val = test_matrix),
  print_every_n = 10,
  early_stopping_rounds = 10,  # stop if val-error has not improved in 10 rounds
  maximize = FALSE,            # "error" metric: lower is better
  eval_metric = "error"
)
## [23:01:48] WARNING: amalgamation/../src/learner.cc:576:
## Parameters: { "early_stop_round" } might not be used.
##
## This could be a false alarm, with some parameters getting used by language bindings but
## then being mistakenly passed down to XGBoost core, or some parameter actually being used
## but getting flagged wrongly here. Please open an issue if you find any such cases.
##
##
## [1] val-error:0.340000 train-error:0.197761
## [11] val-error:0.360000 train-error:0.076493
## [21] val-error:0.360000 train-error:0.033582
## [31] val-error:0.380000 train-error:0.006530
## [41] val-error:0.366667 train-error:0.000000
## [51] val-error:0.363333 train-error:0.000000
## [61] val-error:0.376667 train-error:0.000000
## [71] val-error:0.376667 train-error:0.000000
## [81] val-error:0.383333 train-error:0.000000
## [91] val-error:0.373333 train-error:0.000000
## [100] val-error:0.370000 train-error:0.000000
# Score the held-out set with the fitted booster and summarize performance.
# `predict` on a binary:logistic model returns P(action_taken = 1).
xgb.prob <- predict(xgb2, test_matrix)
# Harden the probabilities into 0/1 labels at the conventional 0.5 cutoff.
xgb.pred <- as.numeric(xgb.prob > 0.5)
# Full confusion matrix plus accuracy/kappa/sensitivity etc., treating
# approval (1) as the positive class.
confusionMatrix(data = as.factor(xgb.pred), reference = testing_label, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 22 73
## 1 38 167
##
## Accuracy : 0.63
## 95% CI : (0.5726, 0.6848)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.0513
##
## Mcnemar's Test P-Value : 0.00125
##
## Sensitivity : 0.6958
## Specificity : 0.3667
## Pos Pred Value : 0.8146
## Neg Pred Value : 0.2316
## Prevalence : 0.8000
## Detection Rate : 0.5567
## Detection Prevalence : 0.6833
## Balanced Accuracy : 0.5312
##
## 'Positive' Class : 1
##
# ROC curve for the XGBoost test-set probabilities; AUC is printed on the plot.
# (response/predictor form of pROC::roc — equivalent to the formula interface.)
test.rf.roc <- roc(response = testing$action_taken, predictor = xgb.prob, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

# Per-feature importance (gain/cover/frequency) from the trained booster,
# then a bar plot of the importance table.
mat2 <- xgb.importance(model = xgb2, feature_names = colnames(training_sparse))
xgb.plot.importance(mat2)

SVM
library(e1071)
# Grid-search the linear-kernel SVM cost parameter with e1071::tune
# (cross-validation over cost in {0.001, ..., 100}).
# NOTE(review): the tuned model below reports "SVM-Type: eps-regression",
# which means `action_taken1` is numeric rather than a factor — tune() is
# fitting regression SVMs, not classifiers. If classification was intended,
# convert the response to a factor (or pass type = "C-classification") and
# re-check the downstream 0.5 thresholding.
tune.out <- e1071::tune(svm,action_taken1 ~ ., data = training.bal.dummy, kernel = "linear", ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
# extract the best model (printing it shows the selected cost and SVM type)
(bestmod <- tune.out$best.model)
##
## Call:
## best.tune(method = svm, train.x = action_taken1 ~ ., data = training.bal.dummy,
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: linear
## cost: 0.001
## gamma: 0.03846154
## epsilon: 0.1
##
##
## Number of Support Vectors: 1022
# Refit the linear SVM at the CV-selected cost (0.001) and evaluate on test.
# NOTE(review): `scale = FALSE` here, but the tune() call above used svm's
# default scaling — the cost chosen under scaled features may not transfer
# to an unscaled fit; confirm this difference is intentional.
# NOTE(review): with a numeric response this is eps-regression, so predict()
# returns real-valued fits (not probabilities) that are thresholded at 0.5.
svmfit = e1071::svm(action_taken1 ~ ., data = training.bal.dummy, kernel = "linear", cost = 0.001, scale = FALSE)
# regression fitted values — the name `probs` is misleading; see note above
probs <- predict(svmfit, testing.dummy)
preds <- as.numeric(ifelse(probs > 0.5, 1, 0))
# Confusion matrix with approval (1) as the positive class.
confusionMatrix(data = as.factor(preds), reference = as.factor(testing.dummy$action_taken1), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 44 133
## 1 16 107
##
## Accuracy : 0.5033
## 95% CI : (0.4453, 0.5613)
## No Information Rate : 0.8
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1035
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4458
## Specificity : 0.7333
## Pos Pred Value : 0.8699
## Neg Pred Value : 0.2486
## Prevalence : 0.8000
## Detection Rate : 0.3567
## Detection Prevalence : 0.4100
## Balanced Accuracy : 0.5896
##
## 'Positive' Class : 1
##
# ROC curve for the SVM scores on the test set; AUC is printed on the plot.
# (response/predictor form of pROC::roc — equivalent to the formula interface.)
test.roc <- roc(response = testing.dummy$action_taken1, predictor = probs, plot = TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
